For excerises 1-4, use the Yelp business json file. For exercises 5-6, use the Yelp review json file.
In [2]:
import pandas as pd
import numpy as np
# read json, lines=True because each line is a single json
business = pd.read_json('Yelp/yelp_academic_dataset_business.json', lines=True)
print(business.shape)
business.head()
Out[2]:
In [9]:
# extract five consecutive digits after a white space at the end of the address
business['zipcode'] = business['full_address'].str.extract('(\s\d{5}$)', expand=True)
business[['zipcode', 'full_address']].tail()
Out[9]:
In [10]:
# create a set containing all the possible categories
cat = set([])
for row in business['categories']:
if 'Restaurants' in row:
cat = cat.union(set(row))
cat = cat.difference(set(['Restaurants']))
In [11]:
# function that creates a dictionary based on the previous categories and sets the values to 1 if the category is presenti in the argument
def cat_dict(row):
if 'Restaurants' in row:
d = dict.fromkeys(cat, 0)
for e in row:
if e != 'Restaurants':
d[e] = 1
return d
else:
return np.NaN
In [12]:
business['Restaurant_type'] = business['categories'].apply(cat_dict)
In [13]:
business[['Restaurant_type', 'categories']].head()
Out[13]:
The reason we modify categorical variables like this is that machine learning algorithms cannot interpret textual data like "True" and "False". They need numerical inputs such as 1 and 0.
In [14]:
# if the value is a boolean replaces the spaces in the key with underscores and returns 0-1 instead of True-False
# else replaces the spaces in the key with underscores, adds the value to the key and returns 1
def replace_entry(key, value):
if type(value) is bool:
if value:
return key.replace(' ', '_'), 1
else:
return key.replace(' ', '_'), 0
else:
return key.replace(' ', '_') + '_' + str(value).replace(' ', '_'), 1
# returns the dictionary after applying the function above, navigates through the dictionary inside the values
def clean_dict(dct):
out_dct = {}
for key in dct:
value = dct[key]
if type(value) is dict:
for subkey in value:
new_key, new_value = replace_entry(subkey, value[subkey])
out_dct[key.replace(' ', '_') + '_' + new_key] = new_value
else:
new_key, new_value = replace_entry(key, value)
out_dct[new_key] = new_value
return out_dct
business['attributes'] = business['attributes'].apply(clean_dict)
In [15]:
business[['attributes']].head(10)
Out[15]:
In [16]:
# dct = {'Friday': {'close': '21:00', 'open': '11:00'},
# 'Monday': {'close': '21:00', 'open': '11:00'},
# 'Thursday': {'close': '21:00', 'open': '11:00'},
# 'Tuesday': {'close': '21:00', 'open': '11:00'},
# 'Wednesday': {'close': '21:00', 'open': '11:00'}}
dct = {'Friday': {'close': '02:00', 'open': '11:00'},
'Monday': {'close': '02:00', 'open': '11:00'},
'Saturday': {'close': '02:00', 'open': '12:00'},
'Sunday': {'close': '22:00', 'open': '12:00'},
'Thursday': {'close': '02:00', 'open': '11:00'},
'Tuesday': {'close': '02:00', 'open': '11:00'},
'Wednesday': {'close': '02:00', 'open': '11:00'}}
day_map = {'Friday': '2017-08-04',
'Monday': '2017-07-31',
'Saturday': '2017-08-05',
'Sunday': '2017-08-06',
'Thursday': '2017-08-03',
'Tuesday': '2017-08-01',
'Wednesday': '2017-08-02'}
In [17]:
def daily_hours(dct):
# initialize output
hours = dict.fromkeys(range(0, 7), 0)
for day in dct:
# set start datetime, end datetime and end of day datetime
start = pd.to_datetime(day_map[day] + ' ' + dct[day]['open'])
end = pd.to_datetime(day_map[day] + ' ' + dct[day]['close'])
end_of_day = pd.to_datetime(day_map[day] + ' 00:00:00') + pd.Timedelta(1, unit='D')
# if closing hours is in the next day add one to the end and add to output the two days
if dct[day]['close'] < dct[day]['open']:
end = end + pd.Timedelta(1, unit='D')
diff = end_of_day - start
hours[start.dayofweek] += int(np.floor(diff.seconds / 3600))
diff = end - end_of_day
hours[end_of_day.dayofweek] += int(np.floor(diff.seconds / 3600))
# else add only the current day
else:
diff = end - start
hours[start.dayofweek] += int(np.floor(diff.seconds / 3600))
return hours[0], hours[1], hours[2], hours[3], hours[4], hours[5], hours[6]
business['Monday'], business['Tuesday'], business['Wednesday'], business['Thursday'], business['Friday'], business['Saturday'], business['Sunday'] = zip(*business['hours'].map(daily_hours))
#daily_hours(dct)
In [8]:
# solution code:
FMT = '%H:%M'
def Time(row, date):
try:
closing = pd.datetime.strptime(row['hours'][date]['close'], FMT)
opening = pd.datetime.strptime(row['hours'][date]['open'], FMT)
time = closing - opening
if time < pd.Timedelta(days=0):
time += pd.Timedelta(days=1)
except:
time = pd.Timedelta(days=0)
return time
business.apply(lambda x: Time(x, 'Monday'), axis=1).head(10)
Out[8]:
In [18]:
business[['hours', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']].head(10)
Out[18]:
In [19]:
review = pd.read_json('Yelp/yelp_academic_dataset_review.json', lines=True)
print(review.shape)
review.head()
Out[19]:
In [20]:
average_review = review[['business_id', 'stars']].groupby('business_id').mean()
In [21]:
average_review.head()
Out[21]:
In [27]:
df = pd.merge(business[['business_id', 'name' , 'Restaurant_type', 'Friday', 'Saturday', 'attributes', 'zipcode']], average_review, left_on='business_id', right_index=True, how='inner')
df.drop('business_id', axis=1, inplace=True)
names = ['Business_Name', 'Restaurant_type', 'Friday hours', 'Saturday hours', 'Attributes', 'Zipcode', 'Average Rating']
df.columns = names
#remove restaurants
df = df[df['Restaurant_type'] == df['Restaurant_type']]
df.head()
Out[27]: